import pandas as pd
import numpy as np
import math
import json
import pickle
import plotly.graph_objects as go
from datetime import date
from tqdm import tqdm
# Read in the three raw json files (one record per line).
portfolio = pd.read_json('Starbucks_Offers/data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('Starbucks_Offers/data/profile.json', orient='records', lines=True)
transcript = pd.read_json('Starbucks_Offers/data/transcript.json', orient='records', lines=True)
profile.head()
# Number of profiles with a missing income (presumably the same rows also
# miss gender -- TODO confirm against the raw data).
sum(pd.isnull(profile['income']))
# Drop every profile with any missing field.
profile = profile.dropna()
# became_member_on is stored as an integer YYYYMMDD; parse it in one pass with
# an explicit format instead of slicing the string by hand and re-parsing.
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'].astype(str), format='%Y%m%d')
profile.head()
# Inspect the distinct event types present in the transcript.
transcript['event'].unique()
# Flatten the 'value' dict to its first value (an offer id for offer events,
# an amount for transactions). This relies on dict insertion order;
# NOTE(review): 'offer completed' events may carry more than one key
# (e.g. a reward) -- confirm the offer id is always the first entry.
transcript['value'] = transcript['value'].apply(lambda x: list(x.values())[0])
transcript.head()
transcript.shape
# Count fully duplicated rows, then drop them.
np.sum(transcript.duplicated())
transcript = transcript.drop_duplicates()
# Composite key "<person>-<value>" used later to follow one offer's lifecycle
# (received / viewed / completed) for one person.
transcript['personvalue'] = transcript['person'].astype(str) + '-' + transcript['value'].astype(str)
transcript.head()
An offer which leads to a transaction is an offer that has first been viewed and then completed.
It can happen that an offer is completed without being viewed, or completed before being viewed.
For each person, we will then look at:
time_viewed: the first time the person viewed the offer
time_completed: the time the person completed the offer (the latest time if the offer has been completed several times)
If time_completed >= time_viewed, then we can consider that this offer has effectively led to a transaction.
As the following cell takes time to run, we comment it out after saving its output to a pickle file.
# One-off computation (kept commented): classify every viewed-and-completed
# offer as either effective (completed at/after first view) or an edge case.
# offer_viewed = transcript[transcript['event'] == 'offer viewed']['personvalue']
# offer_viewed_and_completed = transcript[(transcript['personvalue'].isin(offer_viewed))
#                                         & (transcript['event'] == 'offer completed')]
# offers_leading_to_transaction = []
# edge_cases = []
# for offer in tqdm(offer_viewed_and_completed.sort_values('person')['personvalue'].unique()):
#     # focus on the transcript for one offer for a user
#     offer_timing = transcript[transcript['personvalue'] == offer].sort_values('time')
#     # selecting the first time the offer was viewed
#     time_viewed = offer_timing[offer_timing['event'] == 'offer viewed']['time'].iloc[0]
#     # selecting the last time the offer was completed
#     time_completed = offer_timing[offer_timing['event'] == 'offer completed']['time'].iloc[-1]
#     if time_completed >= time_viewed:
#         offers_leading_to_transaction.append(offer)
#     else:
#         edge_cases.append(offer)
# with open('Starbucks_Offers/offer_leading_to_transaction', 'wb') as f:
#     pickle.dump(offers_leading_to_transaction, f)
# with open('Starbucks_Offers/edge_cases', 'wb') as f:
#     pickle.dump(edge_cases, f)
# Load the precomputed results. Use context managers so the file handles are
# closed deterministically (the bare open() inside pickle.load leaks them).
with open('Starbucks_Offers/offer_leading_to_transaction', 'rb') as f:
    offers_leading_to_transaction = pickle.load(f)
with open('Starbucks_Offers/edge_cases', 'rb') as f:
    edge_cases = pickle.load(f)
Out of the offers received, the percentage of offers that effectively led to a transaction:
len(offers_leading_to_transaction)/len(transcript[transcript['event'] == 'offer received'])
Out of the offers completed and viewed, the percentage of offers viewed only after having been completed:
len(edge_cases)/(len(offers_leading_to_transaction) + len(edge_cases))
# Share of effective transactions per offer id, joined onto the portfolio
# metadata so the offer characteristics can be inspected side by side.
share_per_offer = transcript[
    transcript['personvalue'].isin(offers_leading_to_transaction)
]['value'].value_counts(normalize=True).reset_index()
share_per_offer.columns = ['id', 'share_of_transactions']
types_leading_to_buy = portfolio.merge(share_per_offer, on='id', how='outer')
types_leading_to_buy = types_leading_to_buy.sort_values(by='share_of_transactions', ascending=False)
# Aggregate the shares per offer type (bogo / discount / informational).
overview_types = types_leading_to_buy.groupby('offer_type')['share_of_transactions'].sum()
fig = go.Figure()
fig.add_trace(go.Pie(labels=overview_types.index, values=overview_types))
fig.update_layout({'width': 500, 'height': 500})
fig.show()
Overall, BOGO offers and discounts are responsible for the same number of transactions.
types_leading_to_buy
What we can see here is that the discount offers with low difficulty that are shared on all the channels, including social, tend to lead to transactions more than the other discount offers that are harder to complete and shared through fewer channels.
In between are the BOGO offers. It seems that a BOGO offer will work better if the difficulty is low and if it is shared on social.
It is quite interesting to note that the reward does not make an offer more attractive. What seems to matter most is the amount the person needs to spend to get it, as well as the channels where the offer was shared.
For the remainder of the study, we will name each offer for convenience, using each offer's ranking and its type.
We will leave the informational offers as they are because they won't be part of the offers leading to transactions.
# Name each offer by its transaction-share rank within its type.
# NOTE(review): this assignment is purely positional -- it assumes the row
# order produced by the earlier sort (share_of_transactions descending, NaN
# last) and will silently mislabel offers if the data or sort changes;
# confirm before re-running on new data.
types_leading_to_buy['offer_name'] = [
    'discount_1',
    'discount_2',
    'bogo_1',
    'bogo_2',
    'bogo_3',
    'bogo_4',
    'discount_3',
    'discount_4',
    'informational',
    'informational'
]
# Completion events for the offers that genuinely led to a transaction.
transaction_offer_completed = transcript[
    (transcript['personvalue'].isin(offers_leading_to_transaction))
    & (transcript['event'] == 'offer completed')][['person', 'time', 'value']]
transaction_offer_completed.head()
# All raw transactions; for these rows 'value' is the amount spent
# -- TODO confirm against the transcript schema.
transactions = transcript[transcript['event'] == 'transaction'][['person', 'time', 'value']]
# Join completions to the transactions of the same person at the same time.
# left_on/right_on were identical, so use `on=` directly.
revenue_per_offer = transaction_offer_completed.merge(transactions, how='left', on=['person', 'time'])
revenue_per_offer.head()
# After the merge: value_x = offer id, value_y = transaction amount.
revenue_per_type = revenue_per_offer.groupby('value_x')['value_y'].sum().reset_index()
revenue_per_type.columns = ['id', 'revenue']
revenue_per_type['revenue_share'] = revenue_per_type['revenue'] / revenue_per_type['revenue'].sum()
revenue_per_type = revenue_per_type.drop('revenue', axis=1)
# Compare the revenue ranking with the transaction-share ranking.
types_leading_to_buy.merge(revenue_per_type,
                           on='id',
                           how='outer').sort_values('revenue_share', ascending=False)
The revenue generated by each offer follows the same ranking as the number of transactions, which suggests that no offer systematically leads people to spend more than another.
# Completion events restricted to the offers that led to a transaction.
focus_transcript = transcript[(transcript['personvalue'].isin(offers_leading_to_transaction))
                              & (transcript['event'] == 'offer completed')]
# How many (person, offer) pairs completed the same offer more than once.
repeat_uses = focus_transcript[['person', 'value']].duplicated()
sum(repeat_uses)
There are 3866 cases where the same person uses the same offer several times. We will keep these duplicated lines as they tell us that a specific offer works repeatedly on certain types of person (even though we ultimately hope to win this consumer over as somebody who buys without any offer).
# Attach the human-friendly offer_name to every completion event.
offers_and_persons = focus_transcript[['person', 'value']].merge(
    types_leading_to_buy[['id', 'offer_name']],
    how='left',
    left_on='value',
    right_on='id')
# The raw ids are redundant once the name is attached.
offers_and_persons = offers_and_persons.drop(['id', 'value'], axis=1)
offers_and_persons
Now let's use the user profiles to better understand which user will use which offer.
# Enrich each completion with the demographic profile of the person.
offers_and_persons = offers_and_persons.merge(profile, how='left', left_on='person', right_on='id')
offers_and_persons
# Count completions whose person carries no demographic data.
sum(pd.isnull(offers_and_persons['gender']))
# Drop the rows without demographics.
offers_and_persons = offers_and_persons[~pd.isnull(offers_and_persons['gender'])]
# Gender split across all offer completions.
gender_overall = offers_and_persons['gender'].value_counts(normalize=True)
fig = go.Figure()
gender_pie = go.Pie(labels=gender_overall.index, values=gender_overall)
fig.add_trace(gender_pie)
fig.update_layout(width=500, height=500)
fig.show()
In the transactions we are looking at, we have almost the same number of males and females. Let's look at this stat per offer.
# Count completions per (offer, gender) pair; 'id' is just a convenient
# non-null column to count on.
gender_per_offer = offers_and_persons.groupby(['offer_name', 'gender'])['id'].count().reset_index()
# Join the per-offer totals back on. The right side is a Series indexed by
# offer_name; merge matches on that index level name -- yields id_x (count
# per gender) and id_y (total per offer).
gender_per_offer = gender_per_offer.merge(gender_per_offer.groupby('offer_name')['id'].sum(),
                                          how='left',
                                          on='offer_name')
# Normalise: id_x becomes the gender share within each offer.
gender_per_offer['id_x'] = gender_per_offer['id_x']/gender_per_offer['id_y']
gender_per_offer.head()
fig = go.Figure()
# One stacked bar segment per gender, across all offers.
for gender in ['F', 'M', 'O']:
    fig.add_trace(go.Bar(
        x=gender_per_offer['offer_name'].unique(),
        y=gender_per_offer[(gender_per_offer['gender'] == gender)]['id_x'], name=gender))
fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'})
fig.show()
The share of women seems to be slightly higher for BOGO. The share of men seems to be slightly higher for discounts.
Let's look at the age distribution of the persons of interest.
# Age distribution (in percent) of the people who completed an offer.
fig = go.Figure()
age_hist = go.Histogram(x=offers_and_persons['age'], histnorm='percent', nbinsx=10)
fig.add_trace(age_hist)
fig.show()
We can see that most of the persons are around their 50s. Let's see if a certain offer is more popular depending on the age.
# Mean age of the completers, per offer.
age_per_offer = offers_and_persons.groupby('offer_name')['age'].agg('mean')
age_per_offer
It is hard to tell by looking at the average age per offer if there is a link between the age and the offer which leads to a transaction.
# Verify no missing incomes remain after the gender filter.
missing_income = pd.isnull(offers_and_persons['income'])
sum(missing_income)
# Income distribution (in percent) of the offer completers.
fig = go.Figure()
fig.add_trace(go.Histogram(x=offers_and_persons['income'], histnorm='percent'))
fig.show()
The income distribution does not appear to follow a normal distribution (contrary to the age).
Let's see if we can find a link between the offer and the income of a person.
# Median income of the completers, per offer.
income_per_offer = offers_and_persons.groupby('offer_name')['income'].median()
fig = go.Figure()
income_bar = go.Bar(x=income_per_offer.index, y=income_per_offer)
fig.add_trace(income_bar)
fig.show()
People with slightly higher incomes complete the BOGO offers, which can make sense.
Let's transform this variable so that it translates the time elapsed between the day the person became a member and the reference date.
# Use the most recent membership date in the data as the reference "today".
ref_date = offers_and_persons['became_member_on'].max()
ref_date
# Seniority in days. Vectorized timedelta subtraction replaces the per-row
# apply() -- same integer result, computed in one pass.
offers_and_persons['seniority'] = (ref_date - offers_and_persons['became_member_on']).dt.days
offers_and_persons.head()
fig = go.Figure()
fig.add_trace(go.Histogram(x=offers_and_persons['seniority']))
fig.show()
It is clear that the more recent a client is, the more likely they are to use an offer.
# Transactions whose (person, value) pair never appears among the effective
# offers: purchases made without an offer leading to them.
outside_offers = ~transcript['personvalue'].isin(offers_leading_to_transaction)
no_offer_and_person = transcript[outside_offers & (transcript['event'] == 'transaction')]
# Enrich with the demographic profile of each buyer.
no_offer_and_person = no_offer_and_person[['person', 'value']].merge(
    profile, how='left', left_on='person', right_on='id')
no_offer_and_person
# Share of these transactions with no demographic data, then drop them.
sum(pd.isnull(no_offer_and_person['gender'])) / len(no_offer_and_person)
no_offer_and_person = no_offer_and_person[~pd.isnull(no_offer_and_person['gender'])]
# Gender split of the no-offer buyers.
gender_share = no_offer_and_person['gender'].value_counts(normalize=True)
fig = go.Figure()
fig.add_trace(go.Pie(values=gender_share, labels=gender_share.index))
fig.show()
There are more males that complete a transaction without seeing any offer.
# Age distribution (in percent) of the no-offer buyers.
fig = go.Figure()
no_offer_age_hist = go.Histogram(x=no_offer_and_person['age'], histnorm='percent', nbinsx=10)
fig.add_trace(no_offer_age_hist)
fig.show()
The age distribution looks the same as with the offers.
# Overlay the income distributions: no-offer buyers vs offer users.
fig = go.Figure()
trace_no_offer = go.Histogram(x=no_offer_and_person['income'], histnorm='percent',
                              name='no offer', nbinsx=20)
trace_offer = go.Histogram(x=offers_and_persons['income'], histnorm='percent',
                           name='offer', marker=dict(color='gray'), nbinsx=20)
fig.add_trace(trace_no_offer)
fig.add_trace(trace_offer)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()
People that don't use an offer don't have a higher income when looking at both distributions.
Still, those two distributions are quite different.
# Same seniority feature for the no-offer population. Vectorized timedelta
# subtraction replaces the per-row apply() -- same integer days result.
no_offer_and_person['seniority'] = (ref_date - no_offer_and_person['became_member_on']).dt.days
# Overlay the seniority distributions of both populations.
fig = go.Figure()
fig.add_trace(go.Histogram(x=no_offer_and_person['seniority'], histnorm='percent', name='no offer', nbinsx=20))
fig.add_trace(go.Histogram(x=offers_and_persons['seniority'], histnorm='percent', name='offer', marker=dict(color='gray'), nbinsx=20))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()
People using an offer or not have the same distribution of membership dates.
As the no-offer profile is interesting to exploit, we will add an offer called 'no_offer' which will mean that, for this type of person, we should not show any offer.
We will thus add these profiles to our main dataframe offers_and_persons.
# Label the no-offer population with a synthetic offer name, then stack it
# under the offer users to inspect the class balance.
no_offer_and_person['offer_name'] = 'no_offer'
no_offer_and_person.head()
offers_and_persons.head()
cols_to_keep = ['person', 'offer_name', 'gender', 'age', 'seniority']
frames = [offers_and_persons[cols_to_keep], no_offer_and_person[cols_to_keep]]
df = pd.concat(frames, axis=0, sort=False)
df['offer_name'].value_counts(normalize=True)
We notice that adding the no-offer type results in an imbalanced dataset. We will therefore exclude these rows for now and limit ourselves to the offers_and_persons dataset.
# Keep only the modelling columns from the offer users.
feature_cols = ['offer_name', 'gender', 'age', 'income', 'seniority']
df = offers_and_persons[feature_cols]
df.head()
Let's one hot encode the gender column.
# One-hot encode gender; drop_first makes 'F' the implicit baseline and
# avoids a redundant column.
gender_dummies = pd.get_dummies(df['gender'], prefix='gender', prefix_sep='_', drop_first=True)
df = pd.concat([df.drop('gender', axis=1), gender_dummies],
               axis=1,
               sort=False)
df.head()
df['offer_name'].value_counts(normalize=True)
pd.isnull(df).sum()
# index=False: the row index is meaningless here and would otherwise come back
# as an 'Unnamed: 0' column when the csv is read downstream.
df.to_csv('clean_data.csv', index=False)
# Export the notebook via nbconvert (list-form argv, shell=False semantics).
# NOTE(review): no `--to` flag is given, so this relies on nbconvert's default
# output format -- confirm the intended target (e.g. `--to script`).
from subprocess import call
call(['python', '-m', 'nbconvert', 'explore_and_clean.ipynb'])